This dataset originates from a nutrigenomic study in mice (Martin et al., 2007), designed to investigate how dietary fatty acid composition influences liver lipid profiles and hepatic gene expression.
Experimental Design
Forty mice were cross-classified according to two experimental factors, each with multiple levels:
Genotype (2 levels):
Wild-type (WT)
PPARα knockout (PPARα -/-)
Diet (5 levels):
REF: Reference diet (50/50 corn and colza oils)
COC: Saturated fatty acid-rich diet (hydrogenated coconut oil)
SUN: Omega-6-rich diet (sunflower oil)
LIN: Omega-3-rich diet (linseed oil)
FISH: Mixed diet (corn/colza/enriched fish oils in 43/43/14 ratio)
Each genotype-diet combination includes four biological replicates.
Variables Collected
Two sets of variables were measured for each mouse:
Expression levels of 120 selected hepatic genes, chosen from ~30,000
candidates for their relevance to nutritional regulation.
Measured using nylon macroarrays with radioactive labeling.
Relative concentrations (percentages) of 21 hepatic fatty
acids.
Quantified via gas chromatography.
data("nutrimouse")
genes <- nutrimouse$gene
lipids <- nutrimouse$lipid
metadata <- data.frame(genotype = nutrimouse$genotype, diet = nutrimouse$diet)
metadata$sample_name <- paste0(rownames(metadata), "_", metadata$genotype, "_", metadata$diet)
rownames(genes) <- metadata$sample_name
rownames(lipids) <- metadata$sample_name
Visualize the distribution of gene expression values, to ensure comparability across genes and preparing for multivariate analyses.
genes.melt <- melt(nutrimouse$gene)
ggplot(genes.melt, aes(x=value, col=variable)) +
geom_density() +
guides(col=F)
scaled.genes.melt <- melt(scale(nutrimouse$gene))
scaled.genes.melt <- scaled.genes.melt[,-1]
colnames(scaled.genes.melt) <- c("variable", "value")
ggplot(scaled.genes.melt, aes(x=value, col=variable)) +
geom_density() +
guides(col=F)
Explore structure and variability in genes block.
# run analysis
PCA_genes_res <- opls(x = nutrimouse$gene,
predI = 9)
## PCA
## 40 samples x 120 variables
## standard scaling of predictors
## R2X(cum) pre ort
## Total 0.823 9 0
# saliences
variances_genes <- data.frame(variance = PCA_genes_res@pcaVarVn)
variances_genes$Dim <- rownames(variances_genes)
ggplot(variances_genes, aes(x=Dim, y=variance)) +
geom_bar(stat = "identity") +
theme_light() +
labs(x = "principal components",
y = "percentage of explained variance",
title = "scree plot")
scores_genes <- data.frame(metadata, PCA_genes_res@scoreMN)
ggplot(scores_genes, aes(x=p1, y=p2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", PCA_genes_res@modelDF$R2X[1]),
y=paste0("Dim.2 - ", PCA_genes_res@modelDF$R2X[2]),
title = "scores plots on Dim.1 vs Dim.2") +
theme_light()
ggplot(scores_genes, aes(x=p3, y=p4, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.3 - ", PCA_genes_res@modelDF$R2X[3]),
y=paste0("Dim.4 - ", PCA_genes_res@modelDF$R2X[4]),
title = "scores plots on Dim.3 vs Dim.4") +
theme_light()
loadings_genes <- data.frame(PCA_genes_res@loadingMN)
loadings_genes$variable <- rownames(loadings_genes)
ggplot(loadings_genes, aes(x=p1, y=p2, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.1 - ", PCA_genes_res@modelDF$R2X[1]),
y=paste0("Dim.2 - ", PCA_genes_res@modelDF$R2X[2]),
title = "loadings plots on Dim.1 Dim.2") +
theme_light()
ggplot(loadings_genes, aes(x=p3, y=p4, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.3 - ", PCA_genes_res@modelDF$R2X[3]),
y=paste0("Dim.4 - ", PCA_genes_res@modelDF$R2X[4]),
title = "loadings plots on Dim.3 Dim.4") +
theme_light()
Visualize the distribution of gene expression values, to ensure comparability across lipids and preparing for multivariate analyses.
lipid.melt <- melt(nutrimouse$lipid)
ggplot(lipid.melt, aes(x=value, col=variable)) +
geom_density() +
guides(col=F)
scaled.lipid.melt <- melt(scale(nutrimouse$lipid))
scaled.lipid.melt <- scaled.lipid.melt[,-1]
colnames(scaled.lipid.melt) <- c("variable", "value")
ggplot(scaled.lipid.melt, aes(x=value, col=variable)) +
geom_density() +
guides(col=F)
Explore structure and variability in lipids block.
# run analysis
PCA_lipids_res <- opls(x = nutrimouse$lipid,
predI = 9,
crossvalI = 7,
permI = 100)
## PCA
## 40 samples x 21 variables
## standard scaling of predictors
## R2X(cum) pre ort
## Total 0.974 9 0
# saliences
variances_lipids <- data.frame(variance = PCA_lipids_res@modelDF$R2X)
variances_lipids$Dim <- rownames(variances_lipids)
ggplot(variances_lipids, aes(x=Dim, y=variance)) +
geom_bar(stat = "identity") +
theme_light() +
labs(x = "principal components",
y = "percentage of explained variance",
title = "scree plot")
scores_lipids <- data.frame(metadata, PCA_lipids_res@scoreMN)
ggplot(scores_lipids, aes(x=p1, y=p2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", PCA_lipids_res@modelDF$R2X[1]),
y=paste0("Dim.2 - ", PCA_lipids_res@modelDF$R2X[2]),
title = "scores plots on Dim.1 vs Dim.2") +
theme_light()
ggplot(scores_lipids, aes(x=p3, y=p4, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.3 - ", PCA_lipids_res@modelDF$R2X[3]),
y=paste0("Dim.4 - ", PCA_lipids_res@modelDF$R2X[4]),
title = "scores plots on Dim.3 vs Dim.4") +
theme_light()
loadings_lipids <- data.frame(PCA_lipids_res@loadingMN)
loadings_lipids$variable <- rownames(loadings_lipids)
ggplot(loadings_lipids, aes(x=p1, y=p2, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.1 - ", PCA_lipids_res@modelDF$R2X[1]),
y=paste0("Dim.2 - ", PCA_lipids_res@modelDF$R2X[2]),
title = "loadings plots on Dim.1 Dim.2") +
theme_light()
ggplot(loadings_lipids, aes(x=p3, y=p4, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.3 - ", PCA_lipids_res@modelDF$R2X[3]),
y=paste0("Dim.4 - ", PCA_lipids_res@modelDF$R2X[4]),
title = "loadings plots on Dim.3 Dim.4") +
theme_light()
library(mixOmics)
# run analysis
PLS_cano_res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical")
PLS_cano_scores_genes <- data.frame(metadata, PLS_cano_res$variates$X)
ggplot(PLS_cano_scores_genes, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$X["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$X["comp2"], 2)),
title = "scores plots on Dim.1 vs Dim.2 - genes") +
theme_light()
PLS_cano_scores_lipids <- data.frame(metadata, PLS_cano_res$variates$Y)
ggplot(PLS_cano_scores_lipids, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$Y["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$Y["comp2"], 2)),
title = "scores plots on Dim.1 vs Dim.2 - lipids") +
theme_light()
# or with function from mixomics
plotIndiv(PLS_cano_res)
PLS_cano_loadings_genes <- data.frame(PLS_cano_res$loadings.star[[1]])
PLS_cano_loadings_genes$variable <- rownames(PLS_cano_loadings_genes)
ggplot(PLS_cano_loadings_genes, aes(x=X1, y=X2, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$X["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$X["comp2"], 2)),
title = "loadings plots on Dim.1 Dim.2 - genes") +
theme_light()
PLS_cano_loadings_lipids <- data.frame(PLS_cano_res$loadings.star[[2]])
PLS_cano_loadings_lipids$variable <- rownames(PLS_cano_loadings_lipids)
ggplot(PLS_cano_loadings_lipids, aes(x=X1, y=X2, label=variable)) +
geom_point() +
geom_text_repel() +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$Y["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$Y["comp2"], 2)),
title = "loadings plots on Dim.1 Dim.2 - lipids") +
theme_light()
PLS_reg_res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="regression")
PLS_reg_scores_genes <- data.frame(metadata, PLS_reg_res$variates$X)
ggplot(PLS_reg_scores_genes, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_reg_res$prop_expl_var$X["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_reg_res$prop_expl_var$X["comp2"], 2)),
title = "regression analysis - scores plots on Dim.1 vs Dim.2 - genes") +
theme_light()
ggplot(PLS_cano_scores_genes, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$X["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$X["comp2"], 2)),
title = "canonical analysis - scores plots on Dim.1 vs Dim.2 - genes") +
theme_light()
PLS_reg_scores_lipids <- data.frame(metadata, PLS_reg_res$variates$Y)
ggplot(PLS_cano_scores_lipids, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_cano_res$prop_expl_var$Y["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_cano_res$prop_expl_var$Y["comp2"], 2)),title = "regression analysis - scores plots on Dim.1 vs Dim.2 - lipids") +
theme_light()
ggplot(PLS_reg_scores_lipids, aes(x=comp1, y=comp2, col=diet, shape = genotype)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", round(PLS_reg_res$prop_expl_var$Y["comp1"], 2)),
y=paste0("Dim.2 - ", round(PLS_reg_res$prop_expl_var$Y["comp2"], 2)),
title = "canonical analysis - scores plots on Dim.1 vs Dim.2 - lipids") +
theme_light()
Discriminate genotypes using gene expression
PLSDA_genes_genotype <- opls(x = nutrimouse$gene,
y = metadata$genotype,
predI = NA,
permI = 100)
## PLS-DA
## 40 samples x 120 variables and 1 response
## standard scaling of predictors and response(s)
## R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y pQ2
## Total 0.158 0.916 0.879 0.149 1 0 0.01 0.01
Observe samples distribution
- plot scores on Dim.1 as a bar plot
scores_genes_genotype <- data.frame(metadata, PLSDA_genes_genotype@scoreMN)
ggplot(scores_genes_genotype, aes(x=sample_name, y=p1, fill=genotype, col = diet)) +
geom_bar(stat = "identity") +
labs(title = "scores plots on Dim.1") +
theme_light() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Interpret variables contribution
- plot loadings on Dim.1 as a bar plot
loadings_genes_genotype <- data.frame(PLSDA_genes_genotype@loadingMN)
loadings_genes_genotype$variable <- rownames(loadings_genes_genotype)
loadings_genes_genotype <- loadings_genes_genotype[order(loadings_genes_genotype$p1),]
loadings_genes_genotype$rank <- seq(1,nrow(loadings_genes_genotype))
ggplot(loadings_genes_genotype, aes(x=rank, y=p1, label=variable)) +
geom_bar(stat = "identity") +
geom_text(angle=90, size=1.5, hjust=ifelse(loadings_genes_genotype$p1 <0, 1,0)) +
labs(title = "loadings plots on Dim.1") +
theme_light()
VIP_genes_genotype <- data.frame(VIP = PLSDA_genes_genotype@vipVn)
VIP_genes_genotype$variable <- rownames(VIP_genes_genotype)
VIP_genes_genotype <- VIP_genes_genotype[order(VIP_genes_genotype$VIP, decreasing = T),]
VIP_genes_genotype$rank <- seq(1,nrow(loadings_genes_genotype))
ggplot(VIP_genes_genotype, aes(x=rank, y=VIP, label=variable)) +
geom_bar(stat = "identity") +
geom_text(angle=90, size=1.5, hjust=0) +
labs(title = "VIP plot on Dim.1") +
theme_light()
loadings_VIP_genes_genotype <- merge(loadings_genes_genotype[,-3], VIP_genes_genotype[,-3])
ggplot(loadings_VIP_genes_genotype, aes(x=p1, y=VIP, label=variable)) +
geom_point() +
geom_text_repel(size=3) +
labs(title = "loadings vs VIP plot - Dim.1") +
theme_light()
Discriminate genotypes using gene expression
OPLSDA_genes_genotype <- opls(x = nutrimouse$gene,
y = metadata$genotype,
predI = 1,
orthoI = 1,
permI = 100)
## OPLS-DA
## 40 samples x 120 variables and 1 response
## standard scaling of predictors and response(s)
## R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y pQ2
## Total 0.38 0.951 0.892 0.115 1 1 0.01 0.01
Observe samples distribution
- plot scores on Predictive vs Orthogonal components
oplsda_scores_genes_genotype <- data.frame(metadata, p1= OPLSDA_genes_genotype@scoreMN, o1=OPLSDA_genes_genotype@orthoScoreMN)
ggplot(oplsda_scores_genes_genotype, aes(x=p1, y=o1, shape=genotype, col = diet)) +
geom_point(size=4) +
labs(title = "scores plots Pred vs Ortho") +
theme_light() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Identify most discriminant genes
- plot loadings on Predictive vs Orthogonal components
oplsda_loadings_genes_genotype <- data.frame(p1=OPLSDA_genes_genotype@loadingMN, o1=OPLSDA_genes_genotype@orthoLoadingMN)
oplsda_loadings_genes_genotype$variable <- rownames(oplsda_loadings_genes_genotype)
ggplot(oplsda_loadings_genes_genotype, aes(x=p1, y=o1, label=variable)) +
geom_point() +
geom_text_repel() +
labs(title = "loadings plots on Pred vs Ortho") +
theme_light()
oplsda_loadings_VIP_genes_genotype <- data.frame(oplsda_loadings_genes_genotype, VIP = OPLSDA_genes_genotype@vipVn)
ggplot(oplsda_loadings_VIP_genes_genotype, aes(x=p1, y=VIP, label=variable)) +
geom_point() +
geom_text_repel(size=2, max.overlaps = 25, segment.size=.2) +
labs(title = "loadings vs VIP plot - Dim.1") +
theme_light()